import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib.style as st
# Use the ggplot look for every figure drawn below.
st.use("ggplot")

# Load only the columns that are analysed, then drop fully duplicated rows.
wanted_columns = ["phone", "religion", "birthday", "first", "gender", "lang", "email"]
df = pd.read_csv("newJordan.csv", usecols=wanted_columns)
df = df.drop_duplicates()

# Number of records per gender.
df.groupby("gender")["gender"].count()
gender female 977629 male 2053416 Name: gender, dtype: int64
# Bar chart of the per-gender record counts.
gender_counts = df.groupby("gender")["gender"].count()
gender_counts.plot(kind="bar", color=("purple", "blue"), edgecolor="black")
<AxesSubplot:xlabel='gender'>
# Treat phone numbers as text so that prefix matching works on them.
df["phone"] = df["phone"].astype(str)
df["phone"].count()
3099842
Counting the phone numbers that belong to each phone carrier in Jordan.
# One boolean mask per carrier, keyed on the number prefix.
is_orange = df.phone.str.startswith("96277")
is_zain = df.phone.str.startswith("96279")
is_umniah = df.phone.str.startswith("96278")

# How many numbers each carrier has.
Orange = df.loc[is_orange, "phone"].count()
Zain = df.loc[is_zain, "phone"].count()
Umniah = df.loc[is_umniah, "phone"].count()

# Replace the matching numbers by the carrier name.
df.loc[is_zain, "phone"] = "Zain"
df.loc[is_orange, "phone"] = "Orange"
df.loc[is_umniah, "phone"] = "Umniah"
# df.replace(df[df.phone.str.startswith("96279")].phone,"Zain")
There are 45 wrong phone numbers (numbers that match none of the three carrier prefixes).
# Count the numbers that belong to none of the three carriers.
# The rows of the three carriers were already overwritten with the carrier
# name earlier in the notebook, so the carrier names have to be accepted
# alongside the raw prefixes; testing the prefixes alone reports every row
# as invalid.
known_carrier_markers = ("96277", "96279", "96278", "Zain", "Orange", "Umniah")
df[~df.phone.str.startswith(known_carrier_markers)].phone.count()
3099842
and there are 3099797 correct phone numbers (3099842 in total minus the 45 wrong ones).
# Relabel religion values containing "uman"/"UMAN" as "Humanism". The same
# rule is applied again further down, after the Muslim/Christian/Other rules;
# NOTE(review): this early copy runs before those rules, so removing it may
# change which label a mixed value ends up with - confirm before deleting.
df=df.replace({"religion":[".*(uman|UMAN).*"]},"Humanism",regex=True)
# Total of the three per-carrier counts.
Orange+Umniah+Zain
3099797
A pie chart of the three phone carriers in Jordan.
# Pie chart of the share of each carrier.
# Counts, labels and colours are kept in one ordered mapping so that a slice
# can never be drawn under another carrier's label (the counts used to be
# passed as Zain, Umniah, Orange while the labels read Zain, Orange, Umniah).
plt.gcf().set_size_inches(8, 8)
carrier_counts = {"Zain": Zain, "Orange": Orange, "Umniah": Umniah}
plt.pie(
    list(carrier_counts.values()),
    labels=list(carrier_counts),
    colors=["blue", "orange", "green"],
    shadow=True,
    autopct="%1.1f%%",
)
plt.legend()
<matplotlib.legend.Legend at 0x7fecf722f4c0>
There are 11667 unique religion values. They are not really unique: most differ only in the spelling or the language used.
# Frequency of every raw religion value, before any normalisation.
df["religion"].value_counts()
الاسلام 6531 Muslim 5257 Muslim - Sunni 4168 Islam 4113 مسلم 3580 ... مسلم وراسي مرفوع 1 Other (كلي جروح 1 Muslim - Sunni █████████ 100% ... I <3 God 1 Other (معتقداتي انسانية 1 المسيحيه (في البداء كان الكلمه ولكلمه كان عند الله وكلم 1 Name: religion, Length: 11638, dtype: int64
# Ordered (pattern, label) rules used to collapse the free-text religion
# values. The rules are applied one after the other, in this order, and each
# one sees the result of the previous ones.
religion_rules = [
    (".*(سن|Islam|isl|Musulman|sl|Ислам|SL|Sl|سل|مح|محم).*", "Muslim"),
    (".*(Христианство|sih|seh|cristian|atholic|ch|CH|Ch|مسي).*", "Christian"),
    (".*(God|god|GOD|Other|other).*", "Other"),
    (".*(الله|م|ﺍﻟ|isalam|ﺍﻟﻠﻪ|Isalm|allah|ﻣﺴﻠﻢ|Allah|isalm|لل|الا).*", "Muslim"),
    (".*(indu|udd).*", "Hindu & Buddhist"),
    (".*(uman|UMAN).*", "Humanism"),
    (".*(ruze|urzi).*", "Druze"),
    (".*(ecular|gnostic|Ath|None|none|أدري|no|No|NON|non|NO).*", "Non believer"),
]
for pattern, label in religion_rules:
    df = df.replace({"religion": [pattern]}, label, regex=True)
The script below shows that the number of unique values has been reduced from 11667 to 421, with around 400 of them contributing only 1.5% of the total records. These 1.5% are different spellings, languages and phrases.
# Frequency of the religion values after the normalisation rules.
df["religion"].value_counts()
Muslim 55734 Christian 1751 Other 588 Non believer 140 Humanism 42 ... durze () 1 Memes () 1 00 1 Neutral () 1 Metal \m/ 1 Name: religion, Length: 424, dtype: int64
# Share of the ten most frequent religion values, in percent.
top_religion_share = df["religion"].value_counts(normalize=True).head(10)
top_religion_share * 100
Muslim 94.837326 Christian 2.979513 Other 1.000545 Non believer 0.238225 Humanism 0.071467 Hindu & Buddhist 0.035734 Druze 0.030629 What are your religious beliefs? 0.010210 ا 0.010210 musilm () 0.006806 Name: religion, dtype: float64
# Bar chart of the seven most frequent religion values (linear scale).
top_religions = df["religion"].dropna().value_counts().head(7)
top_religions.plot(kind="bar", figsize=(10, 5))
<AxesSubplot:>
Logarithmic scale; the graph could be deceiving: Islam accounts for 94.853% of the non-NaN religion values.
# Same seven religion values, drawn on a logarithmic y axis.
top_religions = df["religion"].dropna().value_counts().head(7)
top_religions.plot(kind="bar", figsize=(10, 5), color="blue", edgecolor="black")
plt.yscale("log")
# Bar chart of the 20 most frequent language codes.
lang_counts = df["lang"].value_counts()
lang_counts.head(20).plot(kind="bar", figsize=(10, 5))
<AxesSubplot:>
# Relative share of the four most frequent language codes, log-scaled.
lang_share = df["lang"].value_counts(normalize=True)
lang_share.iloc[:4].plot(kind="bar", figsize=(10, 5), color="blue", edgecolor="black")
plt.legend()
plt.yscale("log")
# Counts of the language codes ranked 5th to 20th.
lang_counts = df["lang"].value_counts()
lang_counts.iloc[4:20].plot(kind="bar", figsize=(10, 5), edgecolor="black")
plt.legend()
<matplotlib.legend.Legend at 0x7fecbb916dc0>
The top 20 most common names as they are, without cleaning. Mohamed, for example, appears 3 times in the top 20: twice written in different ways in English and once in Arabic. Some entries are not really first names but rather part of a nickname: in Arabic, fathers are called "father of <eldest son's name>", so in the data the first name is roughly translated as "father of" ...; "mother of" is in the top 20 as well.
# Pie chart of the 20 most frequent first-name values, uncleaned.
first_name_counts = df["first"].value_counts()
first_name_counts.head(20).plot(kind="pie", figsize=(10, 10))
# df.religion.value_counts()
<AxesSubplot:ylabel='first'>
# Birthday strings with the missing entries removed.
dateSeries = df["birthday"].dropna()
Strings without a year were suffixed with the year 2020, because the data contains the leap-year date "02/29" and 2020 is a leap year.
# A run of three digits only occurs when the string carries a year.
has_year = dateSeries.str.contains(r"\d\d\d")
one = dateSeries[~has_year] + "/2020"  # no year: append the placeholder year
two = dateSeries[has_year]             # year already present
Concatenate the modified dates with the ones that already include a year.
# Parse all birthdays (with a real or placeholder year) and plot how many
# fall in each month.
dates = pd.to_datetime(pd.concat([two, one], axis=0))
births_per_month = dates.dt.month.value_counts().sort_index()
births_per_month.plot(kind="bar", figsize=(10, 5), color="cyan", edgecolor="black")
<AxesSubplot:>
Decided not to include the dates that were missing a year.
# Birth-year histogram restricted to the birthdays that came with a year.
# Only `two` (the strings that already contained a year) is used, so the
# entries that received the 2020 placeholder are excluded by construction,
# instead of dropping whichever year happens to be the most frequent one.
birth_years = pd.to_datetime(two).dt.year
birth_years.value_counts().sort_index().plot(figsize=(20, 10), kind="bar", color="b")
<AxesSubplot:>
# Count birthdays per (day, month) pair and pivot the result into a table
# with one row per day of the month and one column per month.
two = pd.to_datetime(two)
birth_month = two.dt.month
birth_day = two.dt.day
dat = birth_month.groupby(birth_day).value_counts().unstack()
As shown in the heatmap, there was a preference for birthdays with repeated numbers like 1/1, 2/2, 3/3, ... because of the way they sound, as well as for the first day of each month and the end of the year,
with a smaller but still noticeable preference for 21/1, 22/2, 23/3, 24/4, ....
Feb 14: Valentine's Day, Nov 14: Children's Day, Mar 21: Mother's Day.
# Annotated day-by-month heatmap with the month axis drawn on top.
fig, ax = plt.subplots(figsize=(12, 18))
heatmap_options = dict(
    cmap="RdBu_r",
    vmin=50,
    annot=True,
    fmt=".0f",
    linewidths=1,
    linecolor="black",
)
sns.heatmap(dat, **heatmap_options)
plt.xlabel("month")
plt.ylabel("day")
ax.xaxis.set_ticks_position("top")
ax.xaxis.set_label_position("top")
# Provider = the text between "@" and the first following dot; keep the ten
# most frequent ones.
emails = df["email"].dropna()
email_domains = emails.str.extract(r"@(.*?)\.", expand=False)
provider = email_domains.value_counts()[:10]

# Horizontal bar chart with the count written next to every bar.
fig, ax = plt.subplots(figsize=(12, 5))
bars = ax.barh(provider.index, provider.values, color="yellow", edgecolor="black")
ax.bar_label(bars, padding=5)
plt.title("count of top 10 email providers")
Text(0.5, 1.0, 'count of top 10 email providers')
# Replace each address by its provider name when that provider is one of the
# top providers held in `provider`.
# The provider is read from the domain part only, with the same pattern that
# was used to build `provider`, so an address that merely contains e.g.
# "yahoo" before the "@" keeps its real provider and the relabelled counts
# agree with `provider`. Missing addresses are left untouched.
email_domain = df.email.str.extract(r"@(.*?)\.", expand=False)
in_top_providers = email_domain.isin(provider.index)
df.loc[in_top_providers, "email"] = email_domain[in_top_providers]
# Ten most frequent values of the relabelled email column.
df["email"].value_counts().head(10)
yahoo 12922 hotmail 5983 gmail 3721 live 419 ymail 208 outlook 181 windowslive 119 mail 79 icloud 36 rocketmail 34 Name: email, dtype: int64
# Display the top-provider counts extracted from the address domains, for
# comparison with the counts of the relabelled email column.
provider
yahoo 12920 hotmail 5983 gmail 3723 live 419 ymail 208 outlook 181 windowslive 119 mail 79 icloud 36 rocketmail 34 Name: email, dtype: int64
# Display the frame after the phone, religion and email relabelling steps.
df
phone | religion | birthday | first | gender | lang | ||
---|---|---|---|---|---|---|---|
0 | NaN | Zain | Muslim | 02/23/1986 | احمد ظاهر | male | en_GB |
1 | NaN | Zain | NaN | NaN | Ahmad | male | en_US |
2 | NaN | Zain | NaN | 06/17 | AlHawi | male | en_GB |
3 | NaN | Zain | NaN | NaN | Abed | male | ar_AR |
4 | NaN | Zain | NaN | NaN | Hassan | male | ar_AR |
... | ... | ... | ... | ... | ... | ... | ... |
3105977 | NaN | 96232312412 | Muslim | 02/21/1990 | سالم | male | ar_AR |
3105978 | y.asaidat@pra.gov.jo | 96232156044 | NaN | 08/20/1975 | Yaseen | male | ar_AR |
3105979 | NaN | 96227381601 | Muslim | NaN | بتهمنا | male | ar_AR |
3105980 | yahoo | 96227243300 | NaN | NaN | Kamal | male | ar_AR |
3105981 | NaN | 96226210443 | Muslim | 10/02/1984 | وليد | male | ar_AR |
3099842 rows × 7 columns
# ddata=newd[["religion","birthday","gender","lang","first"]]
# ddata.count()
# ddata.to_csv("modifiedJO.csv",index=False)
# df.to_csv("data.csv",index=False)
# dates.to_csv("2.csv",index=False)
# provider.to_csv("emails.csv")